{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基础包与数据导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 6028 entries, 0 to 6027\n",
      "Data columns (total 7 columns):\n",
      "house_id        6028 non-null int64\n",
      "neighborhood    6028 non-null object\n",
      "area            6028 non-null int64\n",
      "bedrooms        6028 non-null int64\n",
      "bathrooms       6028 non-null int64\n",
      "style           6028 non-null object\n",
      "price           6028 non-null int64\n",
      "dtypes: int64(5), object(2)\n",
      "memory usage: 329.7+ KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>house_id</th>\n",
       "      <th>neighborhood</th>\n",
       "      <th>area</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>style</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1112</td>\n",
       "      <td>B</td>\n",
       "      <td>1188</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>ranch</td>\n",
       "      <td>598291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>491</td>\n",
       "      <td>B</td>\n",
       "      <td>3512</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>victorian</td>\n",
       "      <td>1744259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5952</td>\n",
       "      <td>B</td>\n",
       "      <td>1134</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>ranch</td>\n",
       "      <td>571669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3525</td>\n",
       "      <td>A</td>\n",
       "      <td>1940</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>ranch</td>\n",
       "      <td>493675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5108</td>\n",
       "      <td>B</td>\n",
       "      <td>2208</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>1101539</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   house_id neighborhood  area  bedrooms  bathrooms      style    price\n",
       "0      1112            B  1188         3          2      ranch   598291\n",
       "1       491            B  3512         5          3  victorian  1744259\n",
       "2      5952            B  1134         3          2      ranch   571669\n",
       "3      3525            A  1940         4          2      ranch   493675\n",
       "4      5108            B  2208         6          4  victorian  1101539"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('house_prices.csv')\n",
    "df.info(); df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 变量探索\n",
    "数据质量和整洁度都还不错，毕竟是经过基础评估与清洗的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 异常值处理\n",
    "# ================ 异常值检验函数：iqr & z分数 两种方法 =========================\n",
    "def outlier_test(data, column, method=None, z=2):\n",
    "    \"\"\" 以某列为依据，使用 上下截断点法 检测异常值(索引) \"\"\"\n",
    "    \"\"\" \n",
    "    full_data: 完整数据\n",
    "    column: full_data 中的指定行，格式 'x' 带引号\n",
    "    return 可选; outlier: 异常值数据框 \n",
    "    upper: 上截断点;  lower: 下截断点\n",
    "    method：检验异常值的方法（可选, 默认的 None 为上下截断点法），\n",
    "            选 Z 方法时，Z 默认为 2\n",
    "    \"\"\"\n",
    "    # ================== 上下截断点法检验异常值 ==============================\n",
    "    if method == None:\n",
    "        print(f'以 {column} 列为依据，使用 上下截断点法(iqr) 检测异常值...')\n",
    "        print('=' * 70)\n",
    "        # 四分位点；这里调用函数会存在异常\n",
    "        column_iqr = np.quantile(data[column], 0.75) - np.quantile(data[column], 0.25)\n",
    "        # 1，3 分位数\n",
    "        (q1, q3) = np.quantile(data[column], 0.25), np.quantile(data[column], 0.75)\n",
    "        # 计算上下截断点\n",
    "        upper, lower = (q3 + 1.5 * column_iqr), (q1 - 1.5 * column_iqr)\n",
    "        # 检测异常值\n",
    "        outlier = data[(data[column] <= lower) | (data[column] >= upper)]\n",
    "        print(f'第一分位数: {q1}, 第三分位数：{q3}, 四分位极差：{column_iqr}')\n",
    "        print(f\"上截断点：{upper}, 下截断点：{lower}\")\n",
    "        return outlier, upper, lower\n",
    "    # ===================== Z 分数检验异常值 ==========================\n",
    "    if method == 'z':\n",
    "        \"\"\" 以某列为依据，传入数据与希望分段的 z 分数点，返回异常值索引与所在数据框 \"\"\"\n",
    "        \"\"\" \n",
    "        params\n",
    "        data: 完整数据\n",
    "        column: 指定的检测列\n",
    "        z: Z分位数, 默认为2，根据 z分数-正态曲线表，可知取左右两端的 2%，\n",
    "           根据您 z 分数的正负设置。也可以任意更改，知道任意顶端百分比的数据集合\n",
    "        \"\"\"\n",
    "        print(f'以 {column} 列为依据，使用 Z 分数法，z 分位数取 {z} 来检测异常值...')\n",
    "        print('=' * 70)\n",
    "        # 计算两个 Z 分数的数值点\n",
    "        mean, std = np.mean(data[column]), np.std(data[column])\n",
    "        upper, lower = (mean + z * std), (mean - z * std)\n",
    "        print(f\"取 {z} 个 Z分数：大于 {upper} 或小于 {lower} 的即可被视为异常值。\")\n",
    "        print('=' * 70)\n",
    "        # 检测异常值\n",
    "        outlier = data[(data[column] <= lower) | (data[column] >= upper)]\n",
    "        return outlier, upper, lower"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "以 price 列为依据，使用 Z 分数法，z 分位数取 2 来检测异常值...\n",
      "======================================================================\n",
      "取 2 个 Z分数：大于 1801467.1287622033 或小于 -293051.3610117055 的即可被视为异常值。\n",
      "======================================================================\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 335 entries, 22 to 6018\n",
      "Data columns (total 7 columns):\n",
      "house_id        335 non-null int64\n",
      "neighborhood    335 non-null object\n",
      "area            335 non-null int64\n",
      "bedrooms        335 non-null int64\n",
      "bathrooms       335 non-null int64\n",
      "style           335 non-null object\n",
      "price           335 non-null int64\n",
      "dtypes: int64(5), object(2)\n",
      "memory usage: 20.9+ KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>house_id</th>\n",
       "      <th>neighborhood</th>\n",
       "      <th>area</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>style</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2345</th>\n",
       "      <td>6876</td>\n",
       "      <td>B</td>\n",
       "      <td>4084</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>2026407</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1608</th>\n",
       "      <td>7330</td>\n",
       "      <td>B</td>\n",
       "      <td>4859</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>2408566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>357</th>\n",
       "      <td>2005</td>\n",
       "      <td>B</td>\n",
       "      <td>3735</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>1854350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5652</th>\n",
       "      <td>3258</td>\n",
       "      <td>B</td>\n",
       "      <td>4330</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>2147685</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2211</th>\n",
       "      <td>2430</td>\n",
       "      <td>B</td>\n",
       "      <td>6922</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>victorian</td>\n",
       "      <td>3425777</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      house_id neighborhood  area  bedrooms  bathrooms      style    price\n",
       "2345      6876            B  4084         6          4  victorian  2026407\n",
       "1608      7330            B  4859         7          4  victorian  2408566\n",
       "357       2005            B  3735         6          4  victorian  1854350\n",
       "5652      3258            B  4330         6          4  victorian  2147685\n",
       "2211      2430            B  6922         8          5  victorian  3425777"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outlier, upper, lower = outlier_test(data=df, column='price', method='z')\n",
    "outlier.info(); outlier.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里简单的丢弃即可\n",
    "df.drop(index=outlier.index, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "neighborhood :\n",
      "                 B     A     C\n",
      "value_counts  2093  1875  1725\n",
      "===================================\n",
      "style :\n",
      "              victorian  ranch  lodge\n",
      "value_counts       2674   1790   1229\n",
      "===================================\n"
     ]
    }
   ],
   "source": [
    "# 类别变量，又称为名义变量，nominal variables\n",
    "nominal_vars = ['neighborhood', 'style']\n",
    "\n",
    "for each in nominal_vars:\n",
    "    print(each, ':')\n",
    "    print(df[each].agg(['value_counts']).T)\n",
    "    # 直接 .value_counts().T 无法实现下面的效果\n",
    "     ## 必须得 agg，而且里面的中括号 [] 也不能少\n",
    "    print('='*35)\n",
    "    # 发现各类别的数量也都还可以，为下面的方差分析做准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 热力图 \n",
    "def heatmap(data, method='pearson', camp='RdYlGn', figsize=(10 ,8)):\n",
    "    \"\"\"\n",
    "    data: 整份数据\n",
    "    method：默认为 pearson 系数\n",
    "    camp：默认为：RdYlGn-红黄蓝；YlGnBu-黄绿蓝；Blues/Greens 也是不错的选择\n",
    "    figsize: 默认为 10，8\n",
    "    \"\"\"\n",
    "    ## 消除斜对角颜色重复的色块\n",
    "    #     mask = np.zeros_like(df2.corr())\n",
    "    #     mask[np.tril_indices_from(mask)] = True\n",
    "    plt.figure(figsize=figsize, dpi= 80)\n",
    "    sns.heatmap(data.corr(method=method), \\\n",
    "                xticklabels=data.corr(method=method).columns, \\\n",
    "                yticklabels=data.corr(method=method).columns, cmap=camp, \\\n",
    "                center=0, annot=True)\n",
    "    # 要想实现只是留下对角线一半的效果，括号内的参数可以加上 mask=mask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAFUCAYAAADPtPD/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAMTQAADE0B0s6tTgAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdd3gU1frA8e+bCiGEEkoSQm8iiHREaZYrXKqAIijXxhX0p6CiIohKUbFeuGBB7KhcFaV3BSwgRYoggoAoERISSgqQRAIJ5/fHbmI2m5CdsJvNJu/neeZJZubM7HvYZU/ec87MiDEGpZRSKjc/bweglFKq5NHGQSmllBNtHJRSSjnRxkEppZQTbRyUUko50cZBKaWUE20clFJKOdHGQSmlfJCIzBSRGBExItLiIuWeEpHf7cuzrp5fGwellPJNXwKdgT8LKiAiXYGhQEvgcuCfItLDlZNr46CUUj7IGPO9MSa2kGK3Ah8aY9KMMRnA+9gai0IFXGqAVsj9V5WZe3VcmPWCt0MoNsZc8HYIykNEytbfj8K14rZzXcr33VtbHgXG5NoyzRgzrQhnqgN8l2s9BrjZlQOLtXFQSilVOHtDUJTGIN/T5frd5cavbP1ZoJRSZcthoF6u9br2bYXSxkEppUqvL4A7RaSCiAQD9wCfuXKgNg5KKeUB4idFXlw6v8gbIhILRANrROSgffsKEWkHYIz5FpgH7AZ+Bb4yxqxy5fw65qCUUj7IGPMA8EA+23vlWZ8CTLF6fs0clFJKOdHMQSmlPMDV7qGSSjMHpZRSTjRzUEopD9DMQSmlVKmjmYNSSnmAZg5KKaVKHW0clFJKOdFuJaWU8gAR7VZSSilVymjmoJRSHqAD0koppUodbRyUUko50W4lpZTyAO1WUkopVepo5qCUUh6gmYNSSqlSRzMHpZTyAM0clFJKlTraOCillHJSaruVZgweQ7+WXagXHkmLZ29jz9E/vB3SRcXEHGPcuDkkJ6cSVrE8L7x4J40aRTmVm/XmChYs2AhAnz4deOjhfi7tA0hKOkPfPlNo264RM2eOBCAj4zwTJ85lz57DYCA6uhpTp95Blaqhnqqqk5iY44wfN4fk5DTCwsoz9YU7aNQo0qncrFkrWbhgEwC9+7TnoYf6ArB7dwxTn/+Cffti6dq1BTNm3ptzzMX2eYsn65stKekM/fo+R9u2jYq9zt76LL/99ipWLN+WU+bIkZPcfMs1jB9/i9vr6ArtViqhvtyxjs6vjiAmMd7bobhk4jP/Y/DgzqxePYXh/76RCRM+diqzdetvLF++lcVLnmb5iol8//0vrF+/p9B92SZP+pSu3Vo4bPv8s+9JT89gyZKnWbrsGapVq8i77672XEXzMWmire6rVk9i+PB/8NSET5zK2Oq3jUWLJ7Bs+dOs/34PG9bvBaB69UqMf/IWxo272em4i+3zFk/WN9uUyZ/RtWtzj9XhYrz1WR4xoieLFj/FosVPMe+LcQQG+tO3bwfPVbSUK7WNw/qDO4lLOeHtMFySmHiavXsP069fRwB69GhDXGwisbEnHcqtXLGNAQM6ERISTFBQIIMGXc3y5VsL3QewdMkWwqtVpH37xk6vf/avc5w/n0VmZhZp6RnUjKjiwdo6Skw8w969R+jbz/af+MYerYmLSyQuNtGh3MqV2xkw4Kqc+g0c1Inl9r8SIyKq0LJlPYKCnBPhi+3zBk/XF2Dp0h8JDw/L9732NG9/lrOtXbOTiIgqtGhR1wO1dI34SZGXksClxkFEel1s8XSQpV18fDI1alQiIMAfsN3qNzKyCvHxyQ7ljsYnERVVNWe9VnQ48UeTC9137FgKH3y4lkcfHeD02rcO6UpoaHmuufpxrrlmLKln/mLYsO7urmKBEgqo+9H4JIdy8Ufz1K9WOPF5yvgCT9f3+LEUPvxgLWMe7e/ewF3kzc9ybl/O38igm69xS53KKlf/nHrc/rMc0B7YbV+/AtgMrHBzXGVO3nu/G1N4ubxlCtr3zNOf8PjjA6lQoZzT+TZu/BUENvzwMiLC+PFzeOON5Ywa1dd6JYrKqe75V96xfgX8A/kCD9b36Wfm8tjjA/J9r4uLtz7L2eLjk9ix/SD/+c9w14P2gJKSARSVS42DMeZaABH5BHjYGLPFvt4B+HdBx4nIGGBMzoZOdeBK54G3si4ysgoJCclkZmYREOCPMYaEhGQiIx27d6IiqxIX93f3w9G4RCKjqhS6b+fOP5gw4SMA0tMyyMg4z/DhM3nvvdF89tl6burfkeDgQAD69u3Au+9+xahRHq1yjojIKhxzqnsKUZFVHcpFRuWp39EkIvOU8QWeru+unYdyxjDS023v9b+Hv8a77xXPG+rNz3K2BfM3ce11LalcuYLH6lkWWB1zuCy7YQAwxvwItCmosDFmmjEmOnvRhiF/4eFhNLu8NkuW2P5pV6/eQa1a4URHV3Mo16NnGxYt2kx6egbnzp1n/vyN9O7VvtB9W36cxrp1U1m3bipjnxhEl67Nc/4z1a5djQ0b9mKMwRjDt9/spklj55klnqt7RZo1q83SJT8C8NXqn4iqVZVa0eEO5Xr2cKzfgvmb6NW7bbHF6S6eru/mLa+ydt1zrF33HGPHDqRLl+bF1jCAdz/LYMuwFi7cxM3apXTJrI7SZYrIMGPMJwAiMgzIdH9Yl+71IY/Rv2VXIsKqsmb0a6RmpNN4onemtLli8uTbGT9+DrNnryK0QjlefOkuAEbc+xqjRvfjiivq0rFjU3r2bEu/vs8C0Kt3O7rYZ6RcbN/FPPhgH5555hP69J6MiNCwYSSTp9zumUoWYPLk2xg//iNmz15NaGg5XnjxDgBGjHiD0aP60OKKunTo2ISePdvSv99zAPTq1Y4uXWz1O3z4BHf8azpnz54jI+M83bs9yYiRPbjttm4X3ectnqxvSeCtzzLA5s37McbQqdNl7q+YRb7+mFCx0ncrIs2Aj4HmgAF+Ae40xvzq0vH3X+XDHcXWXJj1grdDKDbGXPB2CMpDRErthMZ8Cde67Rs9bPI/ivx9d3ri115vWSxlDvZGoJ2IVLSvn/FIVEop5ePKxIC0iNQ3xhwSkcvzbAfAGLPXA7EppZTyElczh9eAPsDyfPYZoIHbIlJKqVKgTGQOxpg+9p/1L1ZORK4xxvzgjsCUUkp5j7tHm15z8/mUUkp5gbtvOOPbeZRSSrmJr3cruTtzKDNTVZVSqjQrGbeqVEqpUkYzB0e+/a+hlFIKKGLmICIBxpj8bpvx+iXGo5RSpUKZyhxEpLmI7AQO2dfbishL2fuNMe+5OT6llFJeYLVb6XXgQSD7sU47gN5ujUgppZTXWe1WqmiM2ZDrthlGRM67PyyllPJtZapbCdstuwOxT1kVkWhAb8mplFKljNXM4XVgIVBNRCYBdwBPujsopZTydb6eOVi9ZfcnIvIH0B8IwfYsh/UeiUwppZTXWJ7KaozZCGwUkUpAbfeHpJRSytusTmVdJSKVRSQU2AUsE5EpnglNKaV8l/hJkZeSwOqAdE1jTArQC1gMNAZucntUSimlvMpqt1Kg/WdXYJUx5ryI6GwlpZTKI3vKv6+ymjn8IiKrsD0Vbp2IhHggJqWUUl5mNXO4C+gJ7DLGpItILWCc26NSSikfV1LGDorK6lTWs8CiXOtxQJy7g1JKKeVdVmcrXRCRrLyLp4JTSilVMBFpLCIbReSAiPwoIpfnU6aciHwoIrtF5BcRWSIi1Qo7t9Uxh4pAmH2pCTwOTLB4DqWUKvWKaSrrbOBtY0wT4GUgvztjjwRCgZbGmBbAMWBsYSe22q2Ulms1DZgmIt8CL7py/IVZL1h5OZ/md/94b4dQbEJrhno7hGLl5+/uZ2SVXBVrVvB2CMUqdsS13g7BZSJSA2gD3GjfNB94XUTqGWNi8hQPAQLts0tDgd2Fnf+SPuUi0hi9SloppZxcSuYgImNEJDbXMiafl6gNHM1+8JoxxgCHgTp5ys0GTgPHsWUNlXDhwWyWMgcROYH9jqz2Y/2B0VbOoZRS6uKMMdOAaa4UzbOeX5/UDfZyEdjuov0h8Aww6WIntjqVtV2u3zOBBGOMDkgrpVQefp7vfTwCRGc/tllsV93VxpY95HYf8JF9tikiMhfbmMOki53cUvjGmD+xpSYRQC0gyMrxSiml3MMYcxz4CRhm3zQIiMlnvOEPoIfYYbuI+ZfCzm91KuvVwO/AW8DbwEER6WTlHEoppdxmJDBSRA5guyB5OICIrBCR7J6eSdjGGfZgaxSqAU8XdmKr3UrTgFuMMT/YA7gamA5cZfE8SilVqvkXw72VjDH7Aac/0I0xvXL9ngTcbPXcVnvFymU3DPYX3QiUs/qiSimlSjarmUO6iNxgjFkDICLdgXS3R6WUUj7OvyzdWwnbtNX5IpKBbWpUMLZBEKWUUqWI1Sukt4lII6Aptvm0+4wx5z0SmVJK+bDiGHPwJMvPkAaygCT7sZEigjEm77xapZRSPszqFdJ3ATOB89iutANb91IN94allFLKm6xmDk8DHYwx+zwRjFJKlRa+fn9Gq+Gf0IZBKaVKP5cyh1zPil4gIg8C/wPOZu83xuh0VqWUyqWsDEinYhtbyK7tzFzrBtvdWZVSSpUSLjUOxhgf7z1TSqni5euZg37pK6WUcqKNg1JKKSdFuQhOKaVUIXz93kqaOSillHKimYNSSnmAv28nDpo5KKWUcqaNg1JKKSfaraSUUh6gA9JKKaVKnRKdOcTEHGPcuDkkJ6cSVrE8L7x4J40aRTmVm/XmChYs2AhAnz4deOjhfi7tA0hKOkPfPlNo264RM2eOBCAj4zwTJ85lz57DYCA6uhpTp95BlaqhnqrqJZsxeAz9WnahXngkLZ69jT1H//B2SG7RsGotZg8YS3hIJVLOpnLfopfZf8Lx8SEhgeX4T68HaR3VlCD/AJbu+4GJa971UsTWNKgaxVv9Hic8pBKnzqZy/5JX2X/SsX7B/oFM7/0QrSIbIwgxKfE8sOQ/JP11GoBnr7+XfzRuT9aFCyT9dZqHlv2XP5KPeqM6haofFsn07qOpWi6M0+fSeOTbmfyWEutQ5oErB9KvYeec9TphNfl03xqmbP4AQZjQ8Q66125DgJ8f2xL2MX7DbM5fyCzuqhRKr5D2oInP/I/BgzuzevUUhv/7RiZM+NipzNatv7F8+VYWL3ma5Ssm8v33v7B+/Z5C92WbPOlTunZr4bDt88++Jz09gyVLnmbpsmeoVq0i77672nMVdYMvd6yj86sjiEmM93YobjWj78N8sH05rV+7i//+8Dlv9nvMqcxjXYYCcNWse+nw5r9pGdGImy7vWtyhFsmMXg/z4Y4VtH3zHmZs+oLX+45xKnN3296EBpXn6tkj6TR7BCdSk3n46sEA9GrSiavrXkHnt+/nmrfv47tDO3nmuruLuxoue7HL/czd9xVd5z3ArF0LebXbg05l3ti1gB4LxtBjwRj6LBrL+QuZLDz4HQBDL7uBZuH1+OeCR+k+bxQAw1v0KdY6lBUltnFITDzN3r2H6devIwA9erQhLjaR2NiTDuVWrtjGgAGdCAkJJigokEGDrmb58q2F7gNYumQL4dUq0r59Y6fXP/vXOc6fzyIzM4u09AxqRlTxYG0v3fqDO4lLOeHtMNyqWoXKXBnZmM9+XgPA4r3rqVslgjqVazqUuyKiIV8dtL2vmReyWPf7NoZeeUOxx2tVtZDKtIxsxOe71wKw+Nf11K0cQZ1KNZ3Klg8IJtAvAH/xo0JQeeJO//3/IMg/kHIBQQCEBYc47CtJwstVokW1Biz4zfZFv/zQJmpXrEF0aPUCj+lRrwPxqYnsPmnLhC8Pr8eGuF05mcK6I9sZ1Lib54MvAn8/KfJSEhSpcRCRABEJyV7cHRRAfHwyNWpUIiDAP/s1iYysQnx8skO5o/FJREVVzVmvFR1O/NHkQvcdO5bCBx+u5dFHBzi99q1DuhIaWp5rrn6ca64ZS+qZvxg2rLu7q6gKER1WnYQziWRduJCz7cip49Su5Pjgwe1x+xnYvBuB/gGEBpWnb7PO1KkcUdzhWpZTP/N3/WJPHSc6T/0+2L6cM+fSOfjoPA6OmUdYcAXe3roYgJUHNrM+ZhcHHvmcA498Rrf6rZn67ZxirYerokLDOZae5FDfo6knqXWRxmFI0xv4bP+anPWdxw9yY90OVAgsR6BfAP0adiG6oj6I0hMsNQ4i0kFEdmN7lsOZXEtB5ceISGz2Mm3aF5aCkzx9dsYUXi5vmYL2PfP0Jzz++EAqVCjndL6NG38FgQ0/vMz69S9RMSyEN95Ybil25R4mzxsqOP9VNf2Hz4g9dYJv732dz4c+y5YjezifVfL6oPNjyFO/fPqpuzdogzGGJtNupcn0IZzKSOWJrsMAaBXZiCbVatPsv0NpOn0o3x36iVf+6dxVU1LkfT8vJrJCOB0imrHw4Pc527787Ru+i93J/L7P83mfKRxIPkzmhSxPhFrmWR2Qngn8G3gL6AqMBv4qqLAxZhowLWedb1z+ZERGViEhIZnMzCwCAvwxxpCQkExkpGP3TlRkVeLiEnPWj8YlEhlVpdB9O3f+wYQJHwGQnpZBRsZ5hg+fyXvvjeazz9ZzU/+OBAcHAtC3bwfeffcrRo1yNXrlDrGnTxAVVh1/P7+c7CG6UnWOnDruUC4j8zzjV8/KWR/TeQj7TvxZrLEWRezpE0RVrI6/+OX8NV0rrDqxeep3T5vefLZ7DRlZ5wGYt3sdD109mBe//5jbrryR9TE7OZWRBsD/fv6aL4Y8V7wVcdHR1EQiQ8Md6hsVWo241Py7Q29tej1f/bmVlIxUh+3/3TGP/+6YB0C/hp05kHzEs4EXUVm7QjrQGLMFCDDGnDHGPA/0K+ygoggPD6PZ5bVZsmQLAKtX76BWrXCio6s5lOvRsw2LFm0mPT2Dc+fOM3/+Rnr3al/ovi0/TmPduqmsWzeVsU8MokvX5rz33mgAateuxoYNezHGYIzh229206Sx8ywp5Vkn01L4OeEgQ1raxg/6X96FwynHOJxyzKFcxeAQygcGA1C3cgTD2/XltU1fFnu8Vp1Mt9Xv1iuuB6B/M3v9TjnWLyYlgesbtMtZ79m4I78ej7HtS06gW/3WBPjZul//2fgqfj0RUyzxW5V49hR7Th5ioH2MoHf9TsSeOU5sAY3DLU2udehSAtvMrbAgW092leCKPHDlQGbtWujZwMsoq5lDdq6eKCKtgFigrntD+tvkybczfvwcZs9eRWiFcrz40l0AjLj3NUaN7scVV9SlY8em9OzZln59nwWgV+92dOnaHOCi+y7mwQf78Mwzn9Cn92REhIYNI5k85XbPVNJNXh/yGP1bdiUirCprRr9GakY6jSfe4u2wLtlDS6fz1k1jeazLbZzOSGPkwpcB+PL253n+mzn8dPQA9apE8tEtT5N5IYvMC1mMXz2L3Qm/ezly1zy8Ygaz+j3Go52HciYjnfuWvALAF0OeY+p3c/gp/jde/O5jZvR5mC33vYPBsP/EYR5ePgOAd7YtoWm12mwa+Tbnss5zLDUpZ19J9MT6WUzvPppRrW7mzPl0Hvl2JgAf9XyKV7d9ys8nbe/bNVFXIAgb4n52OL5iUAhf9n2OrAsX8Pfz493dy1hzeFux18MVJWVguajESh+giDwCfAS0Bb7E1rg8Y4x51ZXjrXQr+Tq/+8d7O4RiE1qz5F7/4Ql+/iV2kp/bVaxZwdshFKvYEQvd9o1+/fzbi/x9t3bQXK+3LJYyB2PMdPuvX4lIOFDOGFPggLRSSpVVZeoiOBHxF5GHROR1Y8x5oIaIXOeh2JRSSnmJ1TGH14BAIPva9kTgM6C9O4NSSinlXVYbh6uNMa1E5CcAY0yKiAR5IC6llPJpZapbCdvFbzlExL8I51BKKVXCWc0cfhaR2wERkXrAeOD7ix6hlFJlkK9ParMa/hhsV0ZHAlvsx491d1BKKaW8y+XMwd6F1MMYMxIY6bmQlFLK95WZMQdjTBa2zEEppVQpZ7VbaZuIdPJIJEoppUoMqwPSXYEHROQAkAoIYIwxHdwemVJK+TBfv7eS1cbh4Xy2lZn7JSmlVFlhtXHYA0wErgRyPyVHMwellMqlzAxI272P7TbdEcCzwHFgtbuDUkop5V1WG4c6xpiXgLPGmKXAQOBq94ellFLKm6x2K52z/8wQkapAChDt3pCUUsr3+foV0lYbh/32RuETYDNwCvjJ7VEppZTyKqsP+/mX/dcZIrINqAKsdHtUSinl43x9QNpq5pDDGPODOwNRSilVchS5cVBKKVUwX78IzseHTJRSSnmCNg5KKaWcaOOglFIe4C9S5MVVItJYRDaKyAER+VFELi+gXDcR2Soie0Rknys3UNUxB6WU8l2zgbeNMR+KyM3Ae4DDF7+IRAFzgH8aY34VkXI43v4oX5o5KKWUB/j7FX1xhYjUANpgu+4MYD5Q3/4I59z+D/jEGPMrgDHmrDEmpbDzF2vmYMyF4nw5rwqtGertEIpN6rFUb4dQrPyD/L0dQrFpUL+Kt0NQBasNHDXGZILt2QkichioA8TkKnc5cEhE1gDVgPXAE8aY9IudXDMHpZTygEsZcxCRMSISm2sp6CmceR+ZkN+ARSDQHbgFaAdUAiYVFr+OOSilVAljjJkGTCuk2BEgWkQCjDGZIiLYsonDecr9CfxkjEkGEJHPgLGFxaCZg1JK+SBjzHFs97YbZt80CIgxxsTkKfo/4FoRCbav9wR2FXZ+bRyUUsoD/KXoiwUjgZH2RzePA4YDiMgKEWkHYIzZCCwFdorIbqA68ExhJ9ZuJaWU8lHGmP3kmbpq394rz/rLwMtWzq2Ng1JKeYCfj9+VVbuVlFJKOdHMQSmlPMDi2EGJo5mDUkopJ9o4KKWUcqLdSkop5QE+/qwfzRyUUko508xBKaU8QAeklVJKlTqaOSillAf4+figg2YOSimlnGjjoJRSyol2KymllAfogLRSSqlSRzMHpZTyAB8fj9bMQSmllDNtHJRSSjnRbiWllPIAHZBWSilV6mjmoJRSHuDrjwn1mcYhJuY448fNITk5jbCw8kx94Q4aNYp0Kjdr1koWLtgEQO8+7Xnoob4A7N4dw9Tnv2Dfvli6dm3BjJn35hxzsX0lUcOqtZg9YCzhIZVIOZvKfYteZv+Jww5lQgLL8Z9eD9I6qilB/gEs3fcDE9e866WI3WvG4DH0a9mFeuGRtHj2NvYc/cPbIRVZo+rRfHD7U4RXqMSpv1K5Z+7z/HosxqFMSFA5Zt48hra1LyPIP5DFu7/jyaVvATCsfU8e7j4kp2x05eqs/30Xt7z/ZHFWw2V1KkYw5er/o3JwRc6cS2fipjf541ScQ5m7m/enR92rc9ZrhdZg0e/r+M/2j3O2BfkF8mmvFzmblcHtK0tmXX2dz3QrTZr4PwYP7syq1ZMYPvwfPDXhE6cyW7f+xvLl21i0eALLlj/N+u/3sGH9XgCqV6/E+CdvYdy4m52Ou9i+kmhG34f5YPtyWr92F//94XPe7PeYU5nHugwF4KpZ99LhzX/TMqIRN13etbhD9Ygvd6yj86sjiEmM93Yol2zW4LG8s3EJlz8/lFfWzuWdoeOdyoz/xx0AtH7pDq58cRitajVhUKtrAfhk6yravXJXzhJ/OpH/bf+qWOtgxYSO9zL/t7XctOQR5uxdwsSr7nMq88GexQxZ8QRDVjzBsFVPcv5CJisObXAo82CrIfx88kBxhV0k/lL0pSTwicYhMfEMe/ceoW+/DgDc2KM1cXGJxMUmOpRbuXI7AwZcRUhIMEFBgQwc1Inly7cBEBFRhZYt6xEU5JwsXWxfSVOtQmWujGzMZz+vAWDx3vXUrRJBnco1HcpdEdGQrw5uBSDzQhbrft/G0CtvKPZ4PWH9wZ3EpZzwdhiXrHpoZVpHN2HuttUALNj1LfXCI6lbNcKhXMtajVi1dzNgey+/3v8jw9r1dDpf+zrNqFmxKkt3r/d88EVQJTiMZlXrs+KQLb41h7cQFVqDyArVCzzm2uj2HE9P4tekQznbWle/jDphESw7VDLrWVpYahxEZIqIVBab5SJyUkQGeSq4bAnxydSoUYmAAP/sOIiMrMLR+CSHcvFHk4iKqpqzXqtWOPF5yvi66LDqJJxJJOvChZxtR04dp3alGg7ltsftZ2DzbgT6BxAaVJ6+zTpTp3JE3tMpL6pduSZHT58k60JWzrYjyceoU8Wxod92+FduaX2d7b0MDuGmll2pG+78Xt59VR/mbltFZq7zlSQRFcI5kZ5Mlvn7s5uQdpLICtUKPOamRtex6PdvctbL+QfzWLs7eX5L6egiLcmsZg79jTEpwA1AJnANMKGgwiIyRkRis5fp074seqR5BneMMQW9ZqFlfF3eegnOeej0Hz4j9tQJvr33dT4f+ixbjuzhfFZmcYWoXOT0Ec1nEPPlNXOJTTnOpjHvsujel9h06Ben97J8YDCDW1/P+5uWeTDaS2fI89m9yKBtzZBwWtdompNpADzSZhjzDqzmxF/JHovRXfyk6EtJYLUfJbvJ7wZ8YYzZf7E31xgzDZiWc7BZW6Rv64jIKhxLSCYzM4uAAH+MMSQkpBAVWdWhXGRUVeLi/u5qOno0icg8ZXxd7OkTRIVVx9/PLyd7iK5UnSOnjjuUy8g8z/jVs3LWx3Qewr4TfxZrrOrijqQcI7pydfz9/HOyh9qVa3A4+ZhDuYzMczy6cGbO+tgbhvFrQoxDmUGtrmXfsT+dBrNLkoS0RGqEhOMvfjnZQ82QcOLTTuZbvn/D7nwXu53T59JytrWq0ZTOtVox4opBBPkHERZUgS/7vMrNy5zH3dSlsZo5pInIOGAI8LWI+AFB7g/LUXh4RZo1q83SJT8C8NXqn4iqVZVa0eEO5Xr2aMOiRZtJT8/g3LnzLJi/iV6923o6vGJ1Mi2FnxMOMqSlbfyg/+VdOJxyjMMpjl8oFYNDKB8YDEDdyhEMb9eX1zZdQuam3O5Eago7Yw9we7seAAy8sjt/JiXwZ1KCQ7nc72W9qpGMvGYA07/5zKHM3R178/7mkp01JGecZn/yIXrV7wLADXU6cjT1BPFp+Y8f9W3QjUUHv3HYduvysfReNFmSffoAACAASURBVIrei0YxbsMMDqYcLrENg79IkZeSwGrmcBfwIDDWGHNMRBoBc90eVT4mT76N8eM/Yvbs1YSGluOFF20zOEaMeIPRo/rQ4oq6dOjYhJ4929K/33MA9OrVji5dmgNw+PAJ7vjXdM6ePUdGxnm6d3uSESN7cNtt3S66ryR6aOl03rppLI91uY3TGWmMXPgyAF/e/jzPfzOHn44eoF6VSD665WkyL2SReSGL8atnsTvhdy9H7h6vD3mM/i27EhFWlTWjXyM1I53GE2/xdlhFcv+8V3j/tgmM+8e/OH02nXvm2j67S0e+yqQV77L9yD4aVKvFp3dNyXkvH104k11xv+Wco0F4LdrUvoz+7zzhrWq47Lkt7zCl0/8xvMVNpJ3/i6c3vgnAa9eOY9aueexNsk1L7hDRAhFhS8Jub4Zbpklx9ssXtVvJF1Wa/IK3Qyg2qcdSvR1CsfIP8vd2CMXmivbR3g6hWP007HO3/dn+8vb7ivx9N7btW15PHyxlDiLSDNsAdIPcxxpjOrg5LqWUUl5ktVtpHvAR8D5QMufLKaWUumRWG4fzxphXPBKJUkqVIiXlSueisjpbaZWIOF+aqZRSqlSxmjmsBRaLSBaQAQhgjDE1Ln6YUkqVLX4+cXOiglltHGZjm866Ax1zUEqpUstq45BojNErqZRSqhAl5WK2orKa+CwUkftEpKqIhGQvHolMKaWU11jNHKbaf76Za5sBys5VQUopVQZYahyMMT4+xKKUUsWjpNxdtagsP91GRGoBnbFlDBuMMUfdHpVSSimvsvqwn/7ALmAocBuwU0T6eiIwpZTyZb7+mFCrmcNE4CpjzEEAEWkIfAEsdXdgSimlvMfqGIJ/dsMAYIz5vQjnUEopVcJZ/WI/LiLDxf74NxG5E8j/MU5KKVWG+fpjQq02DvcB9wJ/ichf9vURbo9KKaWUV1mdyvo7cJWIhGJ7UNAZz4SllFK+zdevkC7KVNZBwA2AEZGvjTEL3R+WUkopb7L6JLhngJuwPfAHYIKINDfGPOf2yJRSyoeVlLGDorKaOdyMbSprOoCIvANsArRxUEqpUsTqgLRkNwwAxpg0bM90UEopVYpYzRx+FJGPgLew3T7jXmCr26NSSikfV1KudC4qq5nDaOAoMBN4HTgOjHJ3UEoppbzL5cxBRPyBfxtjxnkwHqWUKhX8imEqq4g0BuYA1YAU4C5jzN4CylYHfgHWG2NuLuzcLmcOxpgsYJCr5ZVSSnncbOBtY0wT4GXgvYuUfRNY4eqJrY45fC0itxpjPrd4XJnj5192bjnlH1S2nvWUda7sPD7d1/vNvcnT/3YiUgNoA9xo3zQfeF1E6hljYvKUvR04BmwD+rhy/qKMOXwqImkiclxETojIcYvnUEopdREiMkZEYnMtY/IpVhs4aozJBDDGGOAwUCfPuaKAMYClIQGrmUM7i+WVUkpZZIyZBkxzpWie9fzylXeAscaYVLEwDmL13kp/WimvlFJlVTEMSB8BokUkwBiTab9bdm1s2UNunYD37A1DKFBeRFYbY3pc7OQuNQ4icgLnFiqHMaaGK+dRSinlHsaY4yLyEzAM+BDbhKGYvOMNxpiq2b+LyF1AH1dmK7maOWR3J/0bqAq8jS19uQeIc/EcSilVZhTHVFZgJPChiDwJnAbuBBCRFcAzxphtRT2xS41DdneSiHQ1xnTLtWu0iHwPvFTUAJRSShWNMWY/tm6jvNt7FVD+Q2xZRqGsDkhHiUg1Y8xJABGpBkRaPIdSSpV6xZQ5eIzVxuG/wC4RWWZf7wVMdW9ISimlvM3qbKU3RGQ90A3bmMPrxpjdHolMKaWU11h+EhyQAOw0xqwXkQARCTLGnHN3YEop5cv8xLfvkmApehEZCPzI30+Caw4scndQSimlvMtq5vAk0BZYA2CM2SUidd0elVJK+ThfH5C2mvdcMMYk5tmmXUpKKVXKWM0czohITexXS4vItUCy26NSSikf5+uZg9XG4Qls9wOvLyLfAo2Bvu4OSimllHdZncq6TUSuA67GNpV1ozEmxSORKaWU8pqiTGUNBSph61oKwfZoOqWUUrn4ereS1amsQ4CdwGBgCLBTRAZ7IjCllFLeYzVzmAR0MMYcAhCResAqYJ5bo1JKKR/nZ3kyaMliNfqT2Q0DgP2+4SfdGpFSSimvc6lxEJEQEQkBvhaRp0QkQkQiRWQCeoW0UkqVOq52K6ViG4DOHmGZkmufAV51Z1BKKeXrfH1A2tWH/fh255lSSilLijKVVSmlVCF8PXPQjEAppZQTn8kcYmKOM37cHJKT0wgLK8/UF+6gUSPnJ5TOmrWShQs2AdC7T3seesh2d4/du2OY+vwX7NsXS9euLZgx816nY5OSztCv73O0bdso3/3e0qBqFG/1e5zwkEqcOpvK/UteZf/Jww5lgv0Dmd77IVpFNkYQYlLieWDJf0j66zQAz15/L/9o3J6sCxdI+us0Dy37L38kH/VGdQrVqHo0H9z+FOEVKnHqr1Tumfs8vx6LcSgTElSOmTePoW3tywjyD2Tx7u94culbAAxr35OHuw/JKRtduTrrf9/FLe8/WZzVcJsZg8fQr2UX6oVH0uLZ29hz9A9vh1RktStGMKnT/1E5uCKp59KZtOlNDp2Ocyhz5+X9ubHu1TnrtSrWYPHBdUzf8THtajbnwVZDCQksjzGG72K38uauz4u7Gi4pU89z8KZJE//H4MGdWbV6EsOH/4OnJnziVGbr1t9YvnwbixZPYNnyp1n//R42rN8LQPXqlRj/5C2MG3dzga8xZfJndO3a3GN1KKoZvR7mwx0raPvmPczY9AWv9x3jVObutr0JDSrP1bNH0mn2CE6kJvPw1bbrE3s16cTVda+g89v3c83b9/HdoZ08c93dxV0Nl80aPJZ3Ni7h8ueH8sraubwzdLxTmfH/uAOA1i/dwZUvDqNVrSYManUtAJ9sXUW7V+7KWeJPJ/K/7V8Vax3c6csd6+j86ghiEuO9Hcole7LDvSw8uJZBSx/ho71LePqq+5zKzNm7mNtXPsHtK5/gztVPkpmVycqYDQCcOZfGhB9mMnjZo/xr5Xja1LicHvWuKe5qlAk+0TgkJp5h794j9O3XAYAbe7QmLi6RuFjHu4evXLmdAQOuIiQkmKCgQAYO6sTy5dsAiIioQsuW9QgKyj9ZWrr0R8LDw2jfvrFnK2NRtZDKtIxsxOe71wKw+Nf11K0cQZ1KNZ3Klg8IJtAvAH/xo0JQeeJO/30JSpB/IOUCggAICw5x2FeSVA+tTOvoJszdthqABbu+pV54JHWrRjiUa1mrEav2bgYg80IWX+//kWHtejqdr32dZtSsWJWlu9d7PngPWX9wJ3EpJ7wdxiWrEhzGZVXrs/KQ7b1Ye2QLUaE1iKxQvcBjuke351h6EvuSbJdX7U+OIS71OADnLpznQHIM0aHO/xfUpbN6+4ygXL83EJE+IuLv/rAcJcQnU6NGJQIC/LNfm8jIKhyNT3IoF380iaioqjnrtWqFE5+nTH6OH0vhww/WMubR/u4N3A2iw6qTcCaRLHMhZ1vsqeNEV6rhUO6D7cs5cy6dg4/O4+CYeYQFV+DtrYsBWHlgM+tjdnHgkc858MhndKvfmqnfzinWeriqduWaHD19kqwLWTnbjiQfo04Vxy+AbYd/5ZbW1xHoH0BocAg3texK3fCIvKfj7qv6MHfbKjJznU95R80K4Zz4K9nhs3ws7SQRFaoVeEz/htex5Pdv8t0XXq4S19W5ig1xP7k9VnfwEynyUhJYzRx+EJGKIhIOrAfGA2+4P6x85PkHM8YUUEwKLZPX08/M5bHHB1ChQrmix+dBBsd6SD4fnu4N2mCMocm0W2kyfQinMlJ5ouswAFpFNqJJtdo0++9Qmk4fyneHfuKVfz5YLLEXhdPblk99X14zl9iU42wa8y6L7n2JTYd+4XxWpkOZ8oHBDG59Pe9vWubBaJUVTv8nL/JFWDMknFY1mrIyxjnrqxBQnmndn+DjvUvYn3won6PVpbLaOAQYY84AvYE5xphrsN2+O18iMkZEYrOX6dO+LFKQEZFVOJaQTGam7a8/YwwJCSlERVZ1KBcZVZW4uL+7mo4eTSIyT5n87Np5iKcmfML11z3Fyy8vYP36Pfx7+GtFitXdYk+fIKpidfxzDW7VCqtO7KnjDuXuadObZft/ICPrPOcvZDJv9zq61LsSgNuuvJH1MTs5lZGGwfC/n7+mS90ri7UerjqScozoytXx9/s7Ia1duQaHk485lMvIPMejC2fS7pW7uOH1USSln+bXhBiHMoNaXcu+Y386DWYr7ziWlkjNkHCHz3LNkHAS0vLv4uzboDvfx27n9Lk0h+0hAeWYed14vo/dxtx9yz0Z8iUpa5lDsP1nd2Cd/fcL+RcFY8w0Y0x09vLImIIHgy8mPLwizZrVZumSHwH4avVPRNWqSq3ocIdyPXu0YdGizaSnZ3Du3HkWzN9Er95tCz3/5i2vsnbdc6xd9xxjxw6kS5fmvPveqCLF6m4n01P4OeEgt15xPQD9m3XhcMoxDp9y/LKMSUng+gbtctZ7Nu7Ir8djbPuSE+hWvzUB9i/cfza+il9PxBRL/FadSE1hZ+wBbm/XA4CBV3bnz6QE/kxKcChXMTiE8oG2j2O9qpGMvGYA07/5zKHM3R178/5mzRpKiuSM0+xPPsQ/63cB4PraHYlPO0F8Wv7jKX0adGNxni6l8gHBvHbdk2yO/5n3flng8ZjLMqtTWdeJyF77cSNFpAqQWcgxbjF58m2MH/8Rs2evJjS0HC+8aJutMmLEG4we1YcWV9SlQ8cm9OzZlv79ngOgV692dOlim310+PAJ7vjXdM6ePUdGxnm6d3uSESN7cNtt3Yoj/Evy8IoZzOr3GI92HsqZjHTuW/IKAF8MeY6p383hp/jfePG7j5nR52G23PcOBsP+E4d5ePkMAN7ZtoSm1WqzaeTbnMs6z7HUpJx9JdH9817h/dsmMO4f/+L02XTumWt7P5eOfJVJK95l+5F9NKhWi0/vmkLmhSwyL2Tx6MKZ7Ir7LeccDcJr0ab2ZfR/5wlvVcNtXh/yGP1bdiUirCprRr9GakY6jSfe4u2wimTqlneY2On/uLv5TaSd/4tJm94EYEb3cbz18zx+TbJN021fswUiwo8Jux2OH9q0F83DG1LOP5ju0e0BWHt4M+/vWVi8FXFBSckAikpc7ZcHEFtn95XAH8aY0yJSDahtjHFpROiCWev6i/m4Ks+95O0Qik3aybTCC5UiWefKzuB22851vR1Csdp2++du+0b//uhTRf6+6xr1nNdbFquPCTUicgCIFpFo++YM94ellFLKmyw1DiLyCLY7siYD2X8+GaCBm+NSSimf5utXSFsdcxgFNDXGlMz7LiillHILq43DEW0YlFKqcH54fdjgklhtHCaKyLvACuBs9kZjzAq3RqWUUsqrrDYONwF9gSY4jjlo46CUUrn4+lRWq41Df6CeMeYvTwSjlFKqZLA6nP47cN4TgSillCo5rGYOv2G7SnoRjmMOb7o1KqWU8nFlbSprOWzZwxW5tpWZq56VUqqssHqFdMl9fJhSSpUgZWpAWkQCgIeAG7BlDF8DrxljiuXme0oppYqH1W6laUBDYLZ9fThQHxjtzqCUUsrXlanMAdtzHFoZY3vOn4gsA3a4OyillFLeZXU4XfIcI/ZFKaVUKWI1c1gNrBaR97CNOdwFrHR3UEop5evK2lTWscBIYCC2jGEh8La7g1JKKeVdVqeyXgBm2RellFIF8PUBaUt5j4hEicgyEUmzL0tEJNJTwSmllPIOq51is4GNQC37shHtVlJKqVLH6phDbWNM31zrL4rITncGpJRSpYGvP+zHaubgJyIR2SsiUgOdyqqUUqWO1czhFeAnEVmKbSprL2C826NSSikf5+sD0lZnK30sIjuAa7FlDDOMMXs9EplSSimvcblxEBF/YJUx5h/AHs+FpJRSvq84LoITkcbAHKAakALclfcPdhG5FRgHBGLr8XnbGPNaYed2uXEwxmSJjb8xJqvwI5yJj18xaEXFmhW8HUKxaVC/irdDKFb+vt1bYMn2DX96O4Tidbu3A7BsNrYv+w9F5GbgPaBTnjKxwD+NMQkiUgnYLiI7jDE/XOzEVsccNgOLRORjIDV7ozFmhcXzKKWUugT2CUFtgBvtm+YDr4tIPWNMTHa53I2AMeaUiOzDdjdttzYO19h/3p9rmwG0cVBKqVwuZUBaRMYAY3JtmmaMmZanWG3gaPbzdIwxRkQOA3WAmALOezm2zGJEYTFYHZC+1kp5pZRS1tkbgryNQb5F86wX2CKJSDSwGLjPGHO0sBNbzRyw3y6jfu5jjTHfWz2PUkqVZsUwxnoEiBaRAGNMpogItmzisHMsEgWsAZ4zxnzhysmtPiZ0AvA48AeQPShtgA5WzqOUUurSGGOOi8hPwDDgQ2AQEJN7vAFy/qBfC7xkjJnj6vmtZg73AI2MMSctHqeUUmWKn+UbUBTJSOBDEXkSOA3cCSAiK4BnjDHbgCnYxiEeEpGH7MfNMMZ8cLETW20cErRhUEqpksEYsx/nqasYY3rl+v1e4F6r53apcbCPcIPtKXD/AeYCZ3O9uF4lrZRSpYirmcPyPOsDc/1ugAbuCUcppUoHX7/o16XGwRhT39OBKKWUKjmsPglukSvblFKqrPMTvyIvJYHVKOrks62hOwJRSilVcrg6IH0vtsutm4jIj7l2VQL2eyIwpZTyZVI8U1k9xtUB6a+A34BZ2C6Cy3Ya+NndQSmllPIuVwek/wT+BJp5NhyllFIlgdXbZ1QDJgJXAuWytxtj9PYZSimVS0kZWC4qq9G/j+3BERHAs8BxYLW7g1JKKeVdlmcrGWNeAs4aY5ZiuxjuaveHpZRSvk3wK/JSEliN4pz9Z4aIVAUygWj3hqSUUsrbrN54b7+9UfgE2yNDTwE/uT0qpZRSXmX1SXD/sv86Q0S2AVWAlW6PSimlfJyvD0gX5UlwtYAu2G64t8EYk1XIIUoppXyM1XsrDQF2ArcCQ4CfRGSwJwJTSilfJuJX5KUksJo5TAI6GGMOAYhIPWAVMM+tUSmllPIqq43DyeyGAcAYEyMi+mQ4pZTKo5geE+oxLkUvIiEiEgJ8LSJPiUiEiESKyARAb9mtlFKljKuZQyq2AWixr0/Jtc8Ar7ozqGwxMccYN24OycmphFUszwsv3kmjRlFO5Wa9uYIFCzYC0KdPBx56uJ9L+wCSks7Qt88U2rZrxMyZIwF4++1VrFi+LafMkSMnufmWaxg//ha319EV9cMimd59NFXLhXH6XBqPfDuT31JiHco8cOVA+jXsnLNeJ6wmn+5bw5TNHyAIEzreQffabQjw82Nbwj7Gb5jN+QuZxV0Vl9SpGMGUq/+PysEVOXMunYmb3uSPU3EOZe5u3p8edf++/rJWaA0W/b6O/2z/OGdbkF8gn/Z6kbNZGdy+8slii9+K2hUjmNTJVtfUc+lM2vQmh0471vXOy/tzY+66VqzB4oPrmL7jY9rVbM6DrYYSElgeYwzfxW7lzV2fF3c13GbG4DH0a9mFeuGRtHj2NvYc/cPbIZVZrt54zyv50cRn/sfgwZ0ZOPBqVq3azoQJH/P55084lNm69TeWL9/K4iVPExDgx9Ahr9CmbUO6dGl+0X3ZJk/6lK7dWpCWlvNIbEaM6MmIET0BOHcuk65dnqBvX+/dPurFLvczd99XfHHgG3rX78Sr3R6k/+JxDmXe2LWAN3YtACDQL4Dtw95j4cHvABh62Q00C6/HPxc8yvkLmbza9QGGt+jDWz+XzKRvQsd7mf/bWpb+8R031OnIxKvu487VTzuU+WDPYj7YsxiAAD9/vhr4FisObXAo82CrIfx88gBNqtQtttiterLDvSw8uJZlf3zH9bU78vRV93HPV451nbN3MXP2/l3XVQPeYmWMra5nzqUx4YeZxKUeJ8gvkDevf4oe9a5hdcwPxV4Xd/hyxzpe/upjNjz2trdDuWQlZWC5qEps9ImJp9m79zD9+nUEoEePNsTFJhIb6zjEsXLFNgYM6ERISDBBQYEMGnQ1y5dvLXQfwNIlWwivVpH27RsXGMfaNTuJiKhCixbe+YIJL1eJFtUasOA32xf98kObqF2xBtGh1Qs8pke9DsSnJrL7pO2vrsvD67EhbldOprDuyHYGNe7m+eCLoEpwGM2q1mfFofUArDm8hajQGkRWKLi+10a353h6Er8m5QyH0br6ZdQJi2CZ/TwlUZXgMC6rWp+V9hjXHim8rt2j23MsPYl99rruT44hLvU4AOcunOdAcgzRoTU9H7yHrD+4k7iUE94OQ1GExkFE/O2zlDwqPj6ZGjUqERDgn/26REZWIT4+2aHc0fgkoqKq5qzXig4n/mhyofuOHUvhgw/X8uijAy4ax5fzNzLo5mvcUqeiiAoN51h6ElnmQs62o6knqXWRxmFI0xv4bP+anPWdxw9yY90OVAgsR6BfAP0adiG6Yg2Pxl1UERXCOZGe7FDfhLSTRFaoVuAxNzW6jkW/f5OzXs4/mMfa3cnzW971aKyXqmaFcE785VjXY2knibhIXfs3vI4lueqaW3i5SlxX5yo2xOlNC0qCMvWYUBHpgu25Dt/b19uLyMcXKT9GRGKzl2nTvrAUnIg4rBtTeLm8ZQra98zTn/D44wOpUKEcBYmPT2LH9oNe7VICMAVVPB+RFcLpENGMhQe/z9n25W/f8F3sTub3fZ7P+0zhQPJhMi+U3GsXDY71zfs5yK1mSDitazTNyTQAHmkzjHkHVnPir+QCjyspnN7bQuraqkZTVsY4Z0MVAsozrfsTfLx3CfuTD+VztFLWWJ3K+jLQDfgSwBizVUTaFFTYGDMNmJazzjcuf8tFRlYhISGZzMwsAgL8McaQkJBMZGQVh3JRkVWJi0vMWT8al0hkVJVC9+3c+QcTJnwEQHpaBhkZ5xk+fCbvvTc6p/yC+Zu49rqWVK5cwdWw3e5oaiKRoeH4i1/OX5hRodWIS80/9b616fV89edWUjJSHbb/d8c8/rvDdjlKv4adOZB8xLOBF1FCWiI1QhzrWzMknPi0/GdM92/Yne9it3P6XFrOtlY1mtK5VitGXDGIIP8gwoIq8GWfV7l52WPFUgdXHUtLpGY+dU0ooK59G3Tn+zx1BQgJKMfM68bzfew25u5b7umwlYsEf2+HcEms5i8Bxpjf82w7l2/JSxQeHkazy2uzZMkWAFav3kGtWuFERzum3D16tmHRos2kp2dw7tx55s/fSO9e7Qvdt+XHaaxbN5V166Yy9olBdOna3KFhMMawcOEmbvZilxJA4tlT7Dl5iIH2MYLe9TsRe+Y4sQU0Drc0udahSwkg2D+QsKAQAKoEV+SBKwcya9dCzwZeRMkZp9mffIhe9bsAcEOdjhxNPUF8Wv717dugG4sOOnaz3Lp8LL0XjaL3olGM2zCDgymHS1zDAH/X9Z/2ul5fuyPxaQXXtU+DbizO06VUPiCY1657ks3xP/PeLws8HrMqO6xmDmdFJBTb9FVEpDlw9uKHFN3kybczfvwcZs9eRWiFcrz40l0AjLj3NUaN7scVV9SlY8em9OzZln59nwWgV+92dOlqm410sX2F2bx5P8YYOnW6zP0Vs+iJ9bOY3n00o1rdzJnz6Tzy7UwAPur5FK9u+5SfT9ra62uirkAQNsQ5Pta7YlAIX/Z9jqwLF/D38+Pd3ctYc3ib0+uUFM9teYcpnf6P4S1uIu38Xzy98U0AXrt2HLN2zWNvkm2gvUNEC0SELQm7vRnuJZm65R0mdvo/7m5uq+ukTba6zug+jrd+nsev9rq2r2mr64956jq0aS+ahzeknH8w3aNtf/isPbyZ9/eUzMa/MK8PeYz+LbsSEVaVNaNfIzUjncYTvTOFvKwTK/3ZInIjtseENsR224yewDBjzJqLHmhnpVvJ19V+e6a3Qyg21UOCvB1CsfIveFig1Nm+4U9vh1CszKzNbnt3kzM+L/L3XZXgW73+KbN6y+6vROQ3bI2CAM8ZYw56JDKllFJeY6lxEJE6QLwxZpZ9vbyI1DbGlMzRTaWU8pKS8rjPorIa/ZcublNKKeXDrA5IBxljcgagjTF/iUiwm2NSSimfV1IuZisqq9EbEcm5tFZEavL3zfiUUkqVElYzh5nABhH5yL5+B/Cce0NSSinlbVZnK30gIoeAXvZNw40xJffOZkop5SW+fldWq5kDxphvgW/dHolSSqkSw6XGQUReMsY8ISJfAE4XdhhjBrs9MqWU8mG+/phQVzOH7KeoLPNUIEoppUoOV58Et1RE/IHLjTFPFHqAUkqVcb4+5uBy9MaYLMC7DzZQSilVLKw2bUtF5AkRqSEiIdmLRyJTSinlNVZnK71q//lCrm0GfPypFkop5Wa+foW01escfLu2SimlXGL5OgcRqQV0wZYxrDfGHHV7VEop5ePK1F1ZRWQIsBO4FRgC7BQRvcZBKaVKGauZwySggzHmEICI1MP2RLh5bo1KKaWUV1ltHE5mNwwAxpgYETnp5piUUsrn+fqAtNXovxaRp0QkQkQiRWQCsEintCqlVOliNXOYaP85Jc/2l9EprUoplcPXB6R1KqtSSiknlqeyKqWUKlxZG3NQSilVBmjjoJRSyokY4/TsnlJFRMYYY6Z5O47iUpbqW5bqClpfVbzKQuMQa4yJ9nYcxaUs1bcs1RW0vqp4abeSUkopJ9o4KKWUclIWGoey1mdZlupbluoKWl9VjEr9mINSSinrykLmoJRSyiJtHJRSSjnRxkEppZQTbRyU24mIEZHQSzxHvZL8rJCi1NFepxF5tsWISAv3RufbRKSfiLzi7TjKuhLROLjjy8RTRGSFiDQsYN+3ItKnuGPKj4iUmZso+nBd6wEjCiuUHx+usyUiEmCMWWKMedzbsZR1JaJxy//zyAAAB1pJREFUKMmMMb2MMb97MwYR+UREtonIzyKyTERqiEh3EdkpIjNFZBMwQEQqisg7IvKjvexbIhJoP8cYEdkqIj/Z93f0cNiPicgPInJARIbmqkt7EVlnr88OERmUa98DInJQRNYD/861vZ6InBSRZ+z7RolIqIi8LyK/2JeJuco3EpE19n+DnSJyU659RkTG2/8N/hCRG0TkBfu/yx4RaW4v19ge/y4R2S0iz1moo9P7Zd/1FnC5PaYluc4zSEQ2isghEXkq13m+FZHnRWQtsPr/2zv3GKuLK45/vjyWCvWxRiyWbrJBiZRopQklRFCwTbVabTWWpEUQEG21LfVRtRQibdKWtrFJYyUkWqKkrY0mkhQUVJCouFIwvHbZ8oiELqVWg9iWVwSjnP5xzoXhPnd1l93qfJJfduZ35jdz5sz85szj7r1xb3Lo0yJpqaTBcb+3pN8k9nhQUl3IFkZfWClpl6QHJF0maVWsXO6KdL0kzZO0Leq9XtInOtDmFQm7/7SCvUzSDyW9CPxS0lRJTybyaWGz5rBrY9y/QlJT6LlW0qWdoWsmMLNuv/AfCvoRsBb4OzAtkY0E/gq0AK8CY+J+I/6zpYV0n/TqGMApwBPAFqAZWJ6kmxzlbABeAi6ooVtbIQ0wPHn2MWANcPVJsM9ZSXgmMA8YDxwFxiayh4HJERawALgz4gOTdKOB1i5uz59EeAiwF2gAzgjbnVOoF7ALGAR8DvgX8KmQzS+0b7S1AROTMn4N/Amf4AwANgITQrYW+HaEhwJvAw2Jbt+L8ATgEPDViN8L/DnCDwCzkvLObE8dK7VXhMcD68r0r98W2gjYBwyO+IvAUqBvxC8A3kzks4GlEb4NeAHoh38V/zLgnpAtBJpC1h/YAzwSthsMHMTfn88DW4Fe8dzphXBX9YlEltp6KvBkYrMdSZ/pH9cQYDVwWtw/D3i9YKt8dUKbdbcCSee4PcKfBQ5EB68D/gFcEbKxwBsxGDRS2Tlcx4kO4cz4OyZetn4RvwRorqFbG8edw3pgSoRHA+9zcpzD7cA6YHO8KE3x0mwrSrcHd6Kb4toOzA/Z5bgzbA3ZUaCuC9tzcBL/CzARuAr4b6LfpmjfccAPgN8nz4zgROfwDvF/OUlbpI7xTuAh4FTgCNA7kS0GvpXodlaEzwUOJOm+BKyJ8PXATuAXYbte7aljpfaK++Mp7xxGJvGNhXrhzuGbiWwGsCCJ1xfsAiwCJiWy64DnIrwQuDuRvQx8I4nvBobhzmBHpJ9C4uS6qk8kskGJbCrHncP9wJwy+X0X7+9pX3odGNLV7+PH5epJ+5iPAZjZVknv4bPJeuBdM3suZE2S9uCzzDeq5NUMDJM0Hx8Ql8X9rwMXAWslFdIOlFRnZu9WU07SafjM7Y+hyxpJmztezY4haSzwfeBiM3tL0teAOSE+WJwcuNbMdhblUYcPHuPNbH3UZR/ufKvWuxOx0K/FzEqW/5JG1Hj+kMWoUHgk8ixXBhVkBQ7H3/dxR0IS7wNgZoskrQa+jNv/Dty5VcNqtFclDifhYzoEaRsX17k99qhURkmZZrYvttXGAZfhWzyXmtmOGvp/UFL9ivtyLQQ8a2Y3dqI+mYSedOZQ7gUp1+GJe+9x4m9WH9sbjcFxOPAsvlpolVQf+T1iZiOS69O1HENRuSebemA/8O8Y5L9TJe0SYKbi8FJSvaTzcNv0xWeI4DPQruam0KERX/E14dsAQyV9sZBI0oio1wvAVcn+/PQa+a8AbpEzAJgEPG9m+/FZ5JTI/1y8D7zSEeUlDQX2mNkf8O2m0e2sY7X22o/Pzj8oK3EbDYr4rcDKcJorgKmS6qL9pwPPdyRzSQOBAWa2HJiFr2qGfwh9iylnr1o8BdxYqLOk/pL6A8uBryj5pJekUZ2o68eenuQcyrEN6FcYTCRdDJyNL9ffBPpIOj/SHptBSPoMvsW0BLgbdwoNHO9oDZGul6SR7VEkBp1W4IZ4dhRw4YeuYW2ewZf62/BDyU1V0t6BO81NklrwwaExdJ8DvCppFSfOlruKI5JewV/iGWa228z+A1wD3BeHi1uAX+FbNi3AXGC1pCb8/KEaP8Od9Wb8jGGJmRUOMW8AJklqxldMN5vZ7vLZVGQC0CJpI/A4PhDXrCPV26sF2B4HxktKcquBmf0N+DGwPNr3Eo47n4fxFfOGKLMN+F0Hi2gAVkTem/H+/kxH9axCOXtVxcxWAT/H69yM7wQMNLPX8AnBguhLW/HtvEwn0SO+W0mSAaea2cGI78X3YdskfQHv5APw1cVdZtYU6aYB9wH/xDvxXDOTpCvxQUe4A1xsZrPjmYm4w+iNz6aXWpWPzUlqw88VWiUNBx6N5zbgs6q5ZvZ0pxokk/mIUfyOZ3o+PcI5ZDKZjzbZOfz/kZ1DJpPJZEroSZ9W6jYk3Yx/wqSYGWb28snWJ5PJZLqbvHLIZDKZTAk9/dNKmUwmk+kGsnPIZDKZTAnZOWQymUymhOwcMplMJlNCdg6ZTCaTKeF/+Z8zNHzc+m4AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 480x400 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 通过热力图可以看出 area，bedrooms，bathrooms 等变量与房屋价格 price 的关系都还比较强\n",
    " ## 所以值得放入模型，但分类变量 style 与 neighborhood 两者与 price 的关系未知\n",
    "heatmap(data=df, figsize=(6,5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 刚才的探索我们发现，style 与 neighborhood 的类别都是三类，\n",
    " ## 如果只是两类的话我们可以进行卡方检验，所以这里我们使用方差分析\n",
    "    \n",
    "## 利用回归模型中的方差分析\n",
    "## 只有 statsmodels 有方差分析库\n",
    "## 从线性回归结果中提取方差分析结果\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.formula.api import ols # ols 为建立线性回归模型的统计学库\n",
    "from statsmodels.stats.anova import anova_lm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "插播一条样本量和置信水平 α_level 的注意点（置信水平 α 的选择经验）\n",
    "\n",
    "    样本量            α-level\n",
    "    ≤ 100              10%\n",
    "    100 ＜ n ≤ 500      5%\n",
    "    500 ＜ n ≤ 1000     1%\n",
    "    n ＞ 2000          千分之一\n",
    "    \n",
    "样本量过大，α-level 就没什么意义了。\n",
    "\n",
    "数据量很大时，p 值就没用了，样本量通常不超过 5000，\n",
    "\n",
    "为了证明两变量间的关系是稳定的，样本量要控制好。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>df</th>\n",
       "      <th>sum_sq</th>\n",
       "      <th>mean_sq</th>\n",
       "      <th>F</th>\n",
       "      <th>PR(&gt;F)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C(neighborhood)</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.684418e+13</td>\n",
       "      <td>8.422089e+12</td>\n",
       "      <td>121.007384</td>\n",
       "      <td>8.048111e-45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C(style)</th>\n",
       "      <td>2.0</td>\n",
       "      <td>3.514449e+13</td>\n",
       "      <td>1.757225e+13</td>\n",
       "      <td>252.475532</td>\n",
       "      <td>4.075881e-80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Residual</th>\n",
       "      <td>595.0</td>\n",
       "      <td>4.141188e+13</td>\n",
       "      <td>6.959979e+10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    df        sum_sq       mean_sq           F        PR(>F)\n",
       "C(neighborhood)    2.0  1.684418e+13  8.422089e+12  121.007384  8.048111e-45\n",
       "C(style)           2.0  3.514449e+13  1.757225e+13  252.475532  4.075881e-80\n",
       "Residual         595.0  4.141188e+13  6.959979e+10         NaN           NaN"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据集样本数量：6028，这里随机选择 600 条，如果希望分层抽样，可参考文章：\n",
    "df = df.copy().sample(600)\n",
    "\n",
    "# C 表示告诉 Python 这是分类变量，否则 Python 会当成连续变量使用\n",
    "## 这里直接使用方差分析对所有分类变量进行检验\n",
    "## 下面几行代码便是使用统计学库进行方差分析的标准姿势\n",
    "lm = ols('price ~ C(neighborhood) + C(style)', data=df).fit()\n",
    "anova_lm(lm)\n",
    "\n",
    "# Residual 行表示模型不能解释的组内的，其他的是能解释的组间的\n",
    "# df: 自由度（n-1）- 分类变量中的类别个数减1\n",
    "# sum_sq: 总平方和（SSM），residual行的 sum_eq: SSE\n",
    "# mean_sq: msm, residual行的 mean_sq: mse\n",
    "# F：F 统计量，查看卡方分布表即可\n",
    "# PR(>F): P 值\n",
    "\n",
    "# 反复刷新几次，发现都很显著，所以这两个变量也挺值得放入模型中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 多元线性回归建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>price</td>      <th>  R-squared:         </th> <td>   0.605</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.603</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   304.3</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 01 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>9.38e-120</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>03:11:57</td>     <th>  Log-Likelihood:    </th> <td> -8304.0</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   600</td>      <th>  AIC:               </th> <td>1.662e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   596</td>      <th>  BIC:               </th> <td>1.663e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     3</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td> 7.706e+04</td> <td> 2.97e+04</td> <td>    2.592</td> <td> 0.010</td> <td> 1.87e+04</td> <td> 1.35e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>area</th>      <td>  224.8632</td> <td>   20.179</td> <td>   11.143</td> <td> 0.000</td> <td>  185.233</td> <td>  264.494</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>bedrooms</th>  <td> 4.351e+04</td> <td> 2.83e+04</td> <td>    1.537</td> <td> 0.125</td> <td>-1.21e+04</td> <td> 9.91e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>bathrooms</th> <td> -1.26e+04</td> <td> 3.86e+04</td> <td>   -0.327</td> <td> 0.744</td> <td>-8.83e+04</td> <td> 6.31e+04</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>116.473</td> <th>  Durbin-Watson:     </th> <td>   1.933</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td>  <th>  Jarque-Bera (JB):  </th> <td> 185.115</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 1.330</td>  <th>  Prob(JB):          </th> <td>6.35e-41</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.575</td>  <th>  Cond. No.          </th> <td>1.19e+04</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 1.19e+04. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  price   R-squared:                       0.605\n",
       "Model:                            OLS   Adj. R-squared:                  0.603\n",
       "Method:                 Least Squares   F-statistic:                     304.3\n",
       "Date:                Wed, 01 Jul 2020   Prob (F-statistic):          9.38e-120\n",
       "Time:                        03:11:57   Log-Likelihood:                -8304.0\n",
       "No. Observations:                 600   AIC:                         1.662e+04\n",
       "Df Residuals:                     596   BIC:                         1.663e+04\n",
       "Df Model:                           3                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept   7.706e+04   2.97e+04      2.592      0.010    1.87e+04    1.35e+05\n",
       "area         224.8632     20.179     11.143      0.000     185.233     264.494\n",
       "bedrooms    4.351e+04   2.83e+04      1.537      0.125   -1.21e+04    9.91e+04\n",
       "bathrooms   -1.26e+04   3.86e+04     -0.327      0.744   -8.83e+04    6.31e+04\n",
       "==============================================================================\n",
       "Omnibus:                      116.473   Durbin-Watson:                   1.933\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              185.115\n",
       "Skew:                           1.330   Prob(JB):                     6.35e-41\n",
       "Kurtosis:                       3.575   Cond. No.                     1.19e+04\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 1.19e+04. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from statsmodels.formula.api import ols\n",
    "\n",
    "lm = ols('price ~ area + bedrooms + bathrooms', data=df).fit()\n",
    "lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型优化\n",
    "发现精度还不够高，这里通过添加虚拟变量与使用方差膨胀因子检测多元共线性的方式来提升模型精度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6005</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      A  B\n",
       "6005  0  0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 设置虚拟变量\n",
    "# 以名义变量 neighborhood 街区为例\n",
    "nominal_data = df['neighborhood']\n",
    "\n",
    "# 设置虚拟变量\n",
    "dummies = pd.get_dummies(nominal_data)\n",
    "dummies.sample()  # pandas 会自动帮你命名\n",
    "\n",
    "# 每个名义变量生成的虚拟变量中，需要各丢弃一个，这里以丢弃C为例\n",
    "dummies.drop(columns=['C'], inplace=True)\n",
    "dummies.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>house_id</th>\n",
       "      <th>neighborhood</th>\n",
       "      <th>area</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>style</th>\n",
       "      <th>price</th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4077</th>\n",
       "      <td>3930</td>\n",
       "      <td>A</td>\n",
       "      <td>3354</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>victorian</td>\n",
       "      <td>844499</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2651</th>\n",
       "      <td>2965</td>\n",
       "      <td>C</td>\n",
       "      <td>2088</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>victorian</td>\n",
       "      <td>530379</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4904</th>\n",
       "      <td>2386</td>\n",
       "      <td>B</td>\n",
       "      <td>3100</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>victorian</td>\n",
       "      <td>1541295</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      house_id neighborhood  area  bedrooms  bathrooms      style    price  A  \\\n",
       "4077      3930            A  3354         5          3  victorian   844499  1   \n",
       "2651      2965            C  2088         4          2  victorian   530379  0   \n",
       "4904      2386            B  3100         6          4  victorian  1541295  0   \n",
       "\n",
       "      B  \n",
       "4077  0  \n",
       "2651  0  \n",
       "4904  1  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将结果与原数据集拼接\n",
    "results = pd.concat(objs=[df, dummies], axis='columns')  # 按照列来合并\n",
    "results.sample(3)\n",
    "# 对名义变量 style 的处理可自行尝试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>price</td>      <th>  R-squared:         </th> <td>   0.920</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.919</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   1364.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 01 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>8.40e-323</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>03:11:57</td>     <th>  Log-Likelihood:    </th> <td> -7825.3</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   600</td>      <th>  AIC:               </th> <td>1.566e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   594</td>      <th>  BIC:               </th> <td>1.569e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     5</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>-1.511e+05</td> <td> 1.52e+04</td> <td>   -9.912</td> <td> 0.000</td> <td>-1.81e+05</td> <td>-1.21e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>area</th>      <td>  265.0309</td> <td>    9.140</td> <td>   28.997</td> <td> 0.000</td> <td>  247.080</td> <td>  282.982</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>bedrooms</th>  <td> 3.894e+04</td> <td> 1.28e+04</td> <td>    3.047</td> <td> 0.002</td> <td> 1.38e+04</td> <td>  6.4e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>bathrooms</th> <td>-1.373e+04</td> <td> 1.74e+04</td> <td>   -0.788</td> <td> 0.431</td> <td>-4.79e+04</td> <td> 2.05e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>A</th>         <td> 8007.9868</td> <td> 1.13e+04</td> <td>    0.711</td> <td> 0.477</td> <td>-1.41e+04</td> <td> 3.01e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>B</th>         <td>  4.81e+05</td> <td> 1.13e+04</td> <td>   42.698</td> <td> 0.000</td> <td> 4.59e+05</td> <td> 5.03e+05</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>38.602</td> <th>  Durbin-Watson:     </th> <td>   1.991</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td> <th>  Jarque-Bera (JB):  </th> <td>  70.673</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.426</td> <th>  Prob(JB):          </th> <td>4.50e-16</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 4.450</td> <th>  Cond. No.          </th> <td>1.20e+04</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 1.2e+04. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  price   R-squared:                       0.920\n",
       "Model:                            OLS   Adj. R-squared:                  0.919\n",
       "Method:                 Least Squares   F-statistic:                     1364.\n",
       "Date:                Wed, 01 Jul 2020   Prob (F-statistic):          8.40e-323\n",
       "Time:                        03:11:57   Log-Likelihood:                -7825.3\n",
       "No. Observations:                 600   AIC:                         1.566e+04\n",
       "Df Residuals:                     594   BIC:                         1.569e+04\n",
       "Df Model:                           5                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept  -1.511e+05   1.52e+04     -9.912      0.000   -1.81e+05   -1.21e+05\n",
       "area         265.0309      9.140     28.997      0.000     247.080     282.982\n",
       "bedrooms    3.894e+04   1.28e+04      3.047      0.002    1.38e+04     6.4e+04\n",
       "bathrooms  -1.373e+04   1.74e+04     -0.788      0.431   -4.79e+04    2.05e+04\n",
       "A           8007.9868   1.13e+04      0.711      0.477   -1.41e+04    3.01e+04\n",
       "B            4.81e+05   1.13e+04     42.698      0.000    4.59e+05    5.03e+05\n",
       "==============================================================================\n",
       "Omnibus:                       38.602   Durbin-Watson:                   1.991\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               70.673\n",
       "Skew:                           0.426   Prob(JB):                     4.50e-16\n",
       "Kurtosis:                       4.450   Cond. No.                     1.20e+04\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 1.2e+04. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 再次建模\n",
    "lm = ols('price ~ area + bedrooms + bathrooms + A + B', data=results).fit()\n",
    "lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模型末尾提示可能存在多元共线性，需要处理一下"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 自定义方差膨胀因子的检测公式\n",
    "def vif(df, col_i):\n",
    "    \"\"\"\n",
    "    df: 整份数据\n",
    "    col_i：被检测的列名\n",
    "    \"\"\"\n",
    "    cols = list(df.columns)\n",
    "    cols.remove(col_i)\n",
    "    cols_noti = cols\n",
    "    formula = col_i + '~' + '+'.join(cols_noti)\n",
    "    r2 = ols(formula, df).fit().rsquared\n",
    "    return 1. / (1. - r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "area \t 5.165722122582504\n",
      "bedrooms \t 19.637187647252347\n",
      "bathrooms \t 17.577810197393287\n",
      "A \t 1.3069170166627762\n",
      "B \t 1.3401360368558675\n"
     ]
    }
   ],
   "source": [
    "test_data = results[['area', 'bedrooms', 'bathrooms', 'A', 'B']]\n",
    "for i in test_data.columns:\n",
    "    print(i, '\\t', vif(df=test_data, col_i=i))\n",
    "# 发现 bedrooms 和 bathrooms 存在强相关性，可能这两个变量是解释同一个问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>          <td>price</td>      <th>  R-squared:         </th> <td>   0.919</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.918</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   1680.</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 01 Jul 2020</td> <th>  Prob (F-statistic):</th> <td>1.88e-322</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>03:11:57</td>     <th>  Log-Likelihood:    </th> <td> -7830.0</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   600</td>      <th>  AIC:               </th> <td>1.567e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   595</td>      <th>  BIC:               </th> <td>1.569e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     4</td>      <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>    \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>-1.255e+05</td> <td> 1.28e+04</td> <td>   -9.802</td> <td> 0.000</td> <td>-1.51e+05</td> <td>   -1e+05</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>area</th>      <td>  274.6316</td> <td>    8.639</td> <td>   31.789</td> <td> 0.000</td> <td>  257.664</td> <td>  291.599</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>bathrooms</th> <td> 3.202e+04</td> <td> 8895.244</td> <td>    3.600</td> <td> 0.000</td> <td> 1.46e+04</td> <td> 4.95e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>A</th>         <td> 9522.8912</td> <td> 1.13e+04</td> <td>    0.841</td> <td> 0.401</td> <td>-1.27e+04</td> <td> 3.18e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>B</th>         <td> 4.819e+05</td> <td> 1.13e+04</td> <td>   42.502</td> <td> 0.000</td> <td>  4.6e+05</td> <td> 5.04e+05</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>28.641</td> <th>  Durbin-Watson:     </th> <td>   1.965</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td> <th>  Jarque-Bera (JB):  </th> <td>  48.430</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.345</td> <th>  Prob(JB):          </th> <td>3.05e-11</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 4.209</td> <th>  Cond. No.          </th> <td>8.84e+03</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 8.84e+03. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                  price   R-squared:                       0.919\n",
       "Model:                            OLS   Adj. R-squared:                  0.918\n",
       "Method:                 Least Squares   F-statistic:                     1680.\n",
       "Date:                Wed, 01 Jul 2020   Prob (F-statistic):          1.88e-322\n",
       "Time:                        03:11:57   Log-Likelihood:                -7830.0\n",
       "No. Observations:                 600   AIC:                         1.567e+04\n",
       "Df Residuals:                     595   BIC:                         1.569e+04\n",
       "Df Model:                           4                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept  -1.255e+05   1.28e+04     -9.802      0.000   -1.51e+05      -1e+05\n",
       "area         274.6316      8.639     31.789      0.000     257.664     291.599\n",
       "bathrooms   3.202e+04   8895.244      3.600      0.000    1.46e+04    4.95e+04\n",
       "A           9522.8912   1.13e+04      0.841      0.401   -1.27e+04    3.18e+04\n",
       "B           4.819e+05   1.13e+04     42.502      0.000     4.6e+05    5.04e+05\n",
       "==============================================================================\n",
       "Omnibus:                       28.641   Durbin-Watson:                   1.965\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               48.430\n",
       "Skew:                           0.345   Prob(JB):                     3.05e-11\n",
       "Kurtosis:                       4.209   Cond. No.                     8.84e+03\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 8.84e+03. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 果然，bedrooms 和 bathrooms 这两个变量的方差膨胀因子较高，\n",
    " # 也印证了方差膨胀因子大多成对出现的原则，这里我们丢弃膨胀因子较大的 bedrooms 即可\n",
    "lm = ols(formula='price ~ area + bathrooms + A + B', data=results).fit()\n",
    "lm.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "area \t 4.510963576907545\n",
      "bathrooms \t 4.510963576907545\n"
     ]
    }
   ],
   "source": [
    "# 再次进行多元共线性检测\n",
    "test_data = df[['area', 'bathrooms']]\n",
    "for i in test_data.columns:\n",
    "    print(i, '\\t', vif(df=test_data, col_i=i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 精度没变，但具体问题还是需要结合具体业务来分析"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
